{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ce1c75bf-4735-42cd-92f5-8bd846856738",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "Bad key savefig.frameon in file /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 421 ('savefig.frameon : True')\n",
      "You probably need to get an updated matplotlibrc file from\n",
      "https://github.com/matplotlib/matplotlib/blob/v3.3.4/matplotlibrc.template\n",
      "or from the matplotlib source distribution\n",
      "\n",
      "Bad key verbose.level in file /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 472 ('verbose.level  : silent      # one of silent, helpful, debug, debug-annoying')\n",
      "You probably need to get an updated matplotlibrc file from\n",
      "https://github.com/matplotlib/matplotlib/blob/v3.3.4/matplotlibrc.template\n",
      "or from the matplotlib source distribution\n",
      "\n",
      "Bad key verbose.fileo in file /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 473 ('verbose.fileo  : sys.stdout  # a log filename, sys.stdout or sys.stderr')\n",
      "You probably need to get an updated matplotlibrc file from\n",
      "https://github.com/matplotlib/matplotlib/blob/v3.3.4/matplotlibrc.template\n",
      "or from the matplotlib source distribution\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from scipy.stats import pearsonr, spearmanr\n",
    "import scipy.sparse as sparse\n",
    "from scipy.stats import bernoulli, poisson\n",
    "import analysis_utils_mine as utils\n",
    "\n",
    "import json\n",
    "import pandas as pd\n",
    "import ast\n",
    "from datetime import datetime\n",
    "import torch\n",
    "import pandas as pd\n",
    "from datetime import datetime, timedelta\n",
    "import pickle\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import xlsxwriter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b0a0a14a-0dc7-4b21-b3ae-4e831cbe76b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ebfc7f84-457f-4a29-89ed-20a898f6f643",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Floor speeches: load the corpus and the fitted TBIP variational parameters.\n",
    "# NOTE(review): project_dir, fit_dir, data_dir and param_dir are generic names that the\n",
    "# tweets cell further down assigns again, so they only refer to the speeches data until then.\n",
    "\n",
    "project_dir = os.path.abspath('../..//data/floor_speeches_congs_115_116/') \n",
    "fit_dir = os.path.join(project_dir, \"mallet_fits_removed_procedural_speeches\")\n",
    "#source_dir = os.path.join(project_dir, \"data/synthetic\")\n",
    "\n",
    "# Load TBIP data.\n",
    "data_dir = os.path.join(project_dir, \"clean_removing_procedural\")\n",
    "(counts_speeches, vocabulary_speeches, author_indices_speeches, \n",
    " author_map_speeches) = utils.load_text_data(data_dir)\n",
    "\n",
    "# Load TBIP parameters.\n",
    "# The first two returned values are ignored here (presumably the document-level\n",
    "# parameters, judging by the commented-out document_mean line below - confirm in utils).\n",
    "param_dir = os.path.join(project_dir, \"tbip-pytorch-fits-og-rem-procedural-speeches-k50-init-mallet/params/\")\n",
    "(_, _, objective_topic_loc_speeches, objective_topic_scale_speeches, \n",
    " ideological_topic_loc_speeches, ideological_topic_scale_speeches, ideal_point_loc_speeches, \n",
    " ideal_point_scale_speeches) = utils.load_tbip_parameters(param_dir)\n",
    "\n",
    "# Compute means from variational parameters\n",
    "# Objective topics use exp(loc + scale**2 / 2), i.e. the mean of a log-normal with these\n",
    "# parameters; ideological topics and ideal points are taken as their location parameters.\n",
    "#document_mean_speeches = np.exp(document_loc + document_scale ** 2 / 2)\n",
    "objective_topic_mean_speeches = np.exp(objective_topic_loc_speeches + \n",
    "                              objective_topic_scale_speeches ** 2 / 2)\n",
    "ideological_topic_mean_speeches = ideological_topic_loc_speeches\n",
    "ideal_point_mean_speeches = ideal_point_loc_speeches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "151a7096-a567-4e5a-8bfb-d9267b62b153",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tweets: load the corpus and the fitted TBIP variational parameters.\n",
    "# NOTE(review): this cell rebinds project_dir, fit_dir, data_dir and param_dir, which the\n",
    "# floor-speeches cell above also assigns; from here on they refer to the tweets data.\n",
    "\n",
    "project_dir = os.path.abspath('../../data/tweets_cong_115_116/') \n",
    "fit_dir = os.path.join(project_dir, \"mallet_results/tbip_expanded_preprocessing_k50\")\n",
    "#source_dir = os.path.join(project_dir, \"data/synthetic\")\n",
    "\n",
    "# Load TBIP data.\n",
    "data_dir = os.path.join(project_dir, \"clean2\")\n",
    "(counts_tweets, vocabulary_tweets, author_indices_tweets, \n",
    " author_map_tweets) = utils.load_text_data(data_dir)\n",
    "\n",
    "# Load TBIP parameters.\n",
    "# The first two returned values are ignored here (presumably the document-level\n",
    "# parameters, judging by the commented-out document_mean line below - confirm in utils).\n",
    "param_dir = os.path.join(project_dir, \"tbip-og-k50-expanded-vocab-with-mallet-scaled-topics/params/\")\n",
    "(_, _, objective_topic_loc_tweets, objective_topic_scale_tweets, \n",
    " ideological_topic_loc_tweets, ideological_topic_scale_tweets, ideal_point_loc_tweets, \n",
    " ideal_point_scale_tweets) = utils.load_tbip_parameters(param_dir)\n",
    "\n",
    "# Compute means from variational parameters\n",
    "# Objective topics use exp(loc + scale**2 / 2), i.e. the mean of a log-normal with these\n",
    "# parameters; ideological topics and ideal points are taken as their location parameters.\n",
    "#document_mean_tweets = np.exp(document_loc + document_scale ** 2 / 2)\n",
    "objective_topic_mean_tweets = np.exp(objective_topic_loc_tweets + \n",
    "                              objective_topic_scale_tweets ** 2 / 2)\n",
    "ideological_topic_mean_tweets = ideological_topic_loc_tweets\n",
    "ideal_point_mean_tweets = ideal_point_loc_tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b2d58bbf-f76f-418a-bf8c-56338def5e7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ideological_topics(objective_topic_loc, \n",
    "                           objective_topic_scale,\n",
    "                           ideological_topic_loc, \n",
    "                           ideological_topic_scale,\n",
    "                           ideal_point):\n",
    "    \"\"\"Evaluate the topic-word means at a given ideal point.\n",
    "\n",
    "    Computes exp(loc + var / 2) element-wise, where\n",
    "        loc = objective_topic_loc + ideal_point * ideological_topic_loc\n",
    "        var = objective_topic_scale**2 + ideal_point**2 * ideological_topic_scale**2\n",
    "\n",
    "    Args:\n",
    "        objective_topic_loc: location parameters of the objective topics.\n",
    "        objective_topic_scale: scale parameters of the objective topics.\n",
    "        ideological_topic_loc: location parameters of the ideological topics.\n",
    "        ideological_topic_scale: scale parameters of the ideological topics.\n",
    "        ideal_point: ideal point at which to evaluate (the notebook passes -1.0 and 1.0).\n",
    "\n",
    "    Returns:\n",
    "        The result of np.exp applied to loc + var / 2.\n",
    "    \"\"\"\n",
    "    shifted_loc = objective_topic_loc + ideal_point * ideological_topic_loc\n",
    "    combined_var = (objective_topic_scale ** 2 +\n",
    "                    ideal_point ** 2 * ideological_topic_scale ** 2)\n",
    "    return np.exp(shifted_loc + combined_var / 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a3d9bd13-88d3-4c0c-9817-93a570bacfa6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 11433)\n",
      "(50, 11433)\n"
     ]
    }
   ],
   "source": [
    "# The four variational parameter arrays are the same for both calls; only the ideal point differs.\n",
    "_speech_topic_params = (\n",
    "    objective_topic_loc_speeches,\n",
    "    objective_topic_scale_speeches,\n",
    "    ideological_topic_loc_speeches,\n",
    "    ideological_topic_scale_speeches,\n",
    ")\n",
    "\n",
    "# Speech topics evaluated at ideal point -1.\n",
    "ideological_topic_speeches_minus1 = get_ideological_topics(*_speech_topic_params, -1.0)\n",
    "print(ideological_topic_speeches_minus1.shape)\n",
    "\n",
    "# Speech topics evaluated at ideal point +1.\n",
    "ideological_topic_speeches_plus1 = get_ideological_topics(*_speech_topic_params, 1.0)\n",
    "print(ideological_topic_speeches_plus1.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4cc097fe-2341-4229-b4b7-8d2ae38dfb8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 9343)\n",
      "(50, 9343)\n"
     ]
    }
   ],
   "source": [
    "# The four variational parameter arrays are the same for both calls; only the ideal point differs.\n",
    "_tweet_topic_params = (\n",
    "    objective_topic_loc_tweets,\n",
    "    objective_topic_scale_tweets,\n",
    "    ideological_topic_loc_tweets,\n",
    "    ideological_topic_scale_tweets,\n",
    ")\n",
    "\n",
    "# Tweet topics evaluated at ideal point -1.\n",
    "ideological_topic_tweets_minus1 = get_ideological_topics(*_tweet_topic_params, -1.0)\n",
    "print(ideological_topic_tweets_minus1.shape)\n",
    "\n",
    "# Tweet topics evaluated at ideal point +1.\n",
    "ideological_topic_tweets_plus1 = get_ideological_topics(*_tweet_topic_params, 1.0)\n",
    "print(ideological_topic_tweets_plus1.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a92fb254-6977-4f7b-b129-dd0e030e72d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialise both vocabularies as plain Python lists.\n",
    "vocabulary_tweets, vocabulary_speeches = (\n",
    "    list(vocabulary_tweets),\n",
    "    list(vocabulary_speeches),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c478ba31-d946-417f-898b-b15b15807f73",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rescale_to_probs_renorm(arr):\n",
    "    \"\"\"Divide every row of `arr` by that row's sum, so each row adds up to one.\n",
    "\n",
    "    Args:\n",
    "        arr: array supporting `.sum(1, keepdims=True)` (the notebook passes 2-D numpy arrays).\n",
    "\n",
    "    Returns:\n",
    "        A new array of the same shape holding the row-normalised values.\n",
    "    \"\"\"\n",
    "    row_totals = arr.sum(axis=1, keepdims=True)\n",
    "    return arr / row_totals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f35bdd58-9fb6-4dfb-80b7-638096c0fe4f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 11433)\n",
      "(50, 11433)\n",
      "(50, 9343)\n",
      "(50, 9343)\n"
     ]
    }
   ],
   "source": [
    "# rescaling to make them probs \n",
    "ideological_topic_speeches_minus1 = rescale_to_probs_renorm(ideological_topic_speeches_minus1)\n",
    "print(ideological_topic_speeches_minus1.shape)\n",
    "\n",
    "ideological_topic_speeches_plus1 = rescale_to_probs_renorm(ideological_topic_speeches_plus1)\n",
    "print(ideological_topic_speeches_plus1.shape)\n",
    "\n",
    "ideological_topic_tweets_minus1 = rescale_to_probs_renorm(ideological_topic_tweets_minus1)\n",
    "print(ideological_topic_tweets_minus1.shape)\n",
    "\n",
    "ideological_topic_tweets_plus1 = rescale_to_probs_renorm(ideological_topic_tweets_plus1)\n",
    "print(ideological_topic_tweets_plus1.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "111a2a85-1ac1-4776-b1e7-bdec87a6c990",
   "metadata": {},
   "source": [
    "### Get consensus topic labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e5a1ce49-23fe-4a10-8a92-15303b80651f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 49 entries, 0 to 48\n",
      "Data columns (total 10 columns):\n",
      " #   Column                Non-Null Count  Dtype  \n",
      "---  ------                --------------  -----  \n",
      " 0   Topic                 49 non-null     object \n",
      " 1   Name 1                49 non-null     object \n",
      " 2   Description 1         48 non-null     object \n",
      " 3   Notes 1               9 non-null      object \n",
      " 4   Name 2                49 non-null     object \n",
      " 5   Description 2         42 non-null     object \n",
      " 6   Notes 2               16 non-null     object \n",
      " 7   Unnamed: 7            0 non-null      float64\n",
      " 8   Consensus Topic Name  49 non-null     object \n",
      " 9   Notes/Comments        14 non-null     object \n",
      "dtypes: float64(1), object(9)\n",
      "memory usage: 4.0+ KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Read only the sheet holding the consensus labels instead of parsing every sheet\n",
    "# of the workbook (sheet_name=None) and then keeping just 'Sheet1'.\n",
    "consensus_labels_speech = pd.read_excel(\n",
    "    'venue_diff_polsci/consensus_topic_labeling_files/results/speeches_consensus_labeling.xlsx',\n",
    "    sheet_name='Sheet1',\n",
    "    engine='openpyxl',\n",
    ")\n",
    "# DataFrame.info() prints its summary itself and returns None, so it is called directly;\n",
    "# wrapping it in print() appended a stray 'None' line to the output.\n",
    "consensus_labels_speech.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "010b0071-43df-4f7e-adb0-3871b7c3c7af",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 50 entries, 0 to 49\n",
      "Data columns (total 10 columns):\n",
      " #   Column                Non-Null Count  Dtype  \n",
      "---  ------                --------------  -----  \n",
      " 0   Topic                 50 non-null     object \n",
      " 1   Name 1                50 non-null     object \n",
      " 2   Description 1         50 non-null     object \n",
      " 3   Notes 1               50 non-null     object \n",
      " 4   Name 2                50 non-null     object \n",
      " 5   Description 2         38 non-null     object \n",
      " 6   Notes 2               22 non-null     object \n",
      " 7   Unnamed: 7            0 non-null      float64\n",
      " 8   Consensus Topic Name  50 non-null     object \n",
      " 9   Notes/Comments        16 non-null     object \n",
      "dtypes: float64(1), object(9)\n",
      "memory usage: 4.0+ KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Read only the sheet holding the consensus labels instead of parsing every sheet\n",
    "# of the workbook (sheet_name=None) and then keeping just 'Sheet1'.\n",
    "consensus_labels_tweet = pd.read_excel(\n",
    "    'venue_diff_polsci/consensus_topic_labeling_files/results/tweets_consensus_labeling.xlsx',\n",
    "    sheet_name='Sheet1',\n",
    "    engine='openpyxl',\n",
    ")\n",
    "# DataFrame.info() prints its summary itself and returns None, so it is called directly;\n",
    "# wrapping it in print() appended a stray 'None' line to the output.\n",
    "consensus_labels_tweet.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5df59cd7-a50b-40c2-8a55-15bf22edd217",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Consensus labels whose topics are excluded from the selection below.\n",
    "labels_to_discard = [\n",
    "    'DISCARD',\n",
    "    'Congratulations',\n",
    "    'Honoring Service Members',\n",
    "    'Member Votes',\n",
    "    'Memorial Speech',\n",
    "    'Motions',\n",
    "    'Yielding',\n",
    "    'Congratulatory Messages',\n",
    "    'Constituent Outreach',\n",
    "    'Dear Colleague and Newsletters',\n",
    "    'Live Proceedings',\n",
    "    'Local Constituent Services',\n",
    "    'Social Media',\n",
    "    'Town Hall Meetings',\n",
    "    'Tributes',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f3d4e113-6c7f-4171-a8b6-66b9be4fda51",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map topic index -> consensus label. The second whitespace-separated token of each\n",
    "# 'Topic' cell is parsed as an integer and shifted down by one (presumably converting\n",
    "# one-based topic numbers to zero-based indices).\n",
    "speech_c_ts = list(consensus_labels_speech['Topic'])\n",
    "speech_c_labels = list(consensus_labels_speech['Consensus Topic Name'])\n",
    "speech_topics_to_labels = {\n",
    "    int(topic_cell.split()[1]) - 1: consensus_label\n",
    "    for topic_cell, consensus_label in zip(speech_c_ts, speech_c_labels)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "76eb9a9c-ff5a-4033-8780-7c20a700bec9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map topic index -> consensus label. The second whitespace-separated token of each\n",
    "# 'Topic' cell is parsed as an integer and shifted down by one (presumably converting\n",
    "# one-based topic numbers to zero-based indices).\n",
    "tweet_c_ts = list(consensus_labels_tweet['Topic'])\n",
    "tweet_c_labels = list(consensus_labels_tweet['Consensus Topic Name'])\n",
    "tweet_topics_to_labels = {\n",
    "    int(topic_cell.split()[1]) - 1: consensus_label\n",
    "    for topic_cell, consensus_label in zip(tweet_c_ts, tweet_c_labels)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "54ae8c1a-1fc8-4589-beaf-25e61d100688",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "# Keep the speech topic indices whose consensus label is not on the discard list.\n",
    "selected_speech_topics_after_discarding_based_on_labels = [\n",
    "    topic_ind\n",
    "    for topic_ind, consensus_label in speech_topics_to_labels.items()\n",
    "    if consensus_label not in labels_to_discard\n",
    "]\n",
    "print(len(selected_speech_topics_after_discarding_based_on_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5c6c41f9-8a8a-4254-82c0-f94dbfb21842",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    }
   ],
   "source": [
    "# Keep the tweet topic indices whose consensus label is not on the discard list.\n",
    "selected_tweet_topics_after_discarding_based_on_labels = [\n",
    "    topic_ind\n",
    "    for topic_ind, consensus_label in tweet_topics_to_labels.items()\n",
    "    if consensus_label not in labels_to_discard\n",
    "]\n",
    "print(len(selected_tweet_topics_after_discarding_based_on_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "bbcc5fae-5c09-47a8-bef9-17271640aa4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dataframe_from_annotated_xlsx_file_path(path):\n",
    "    \"\"\"Load an annotation workbook and keep only the rows that have a topic name.\n",
    "\n",
    "    Args:\n",
    "        path: location of the .xlsx workbook; it must contain a sheet called 'Sheet1'\n",
    "            with a 'Topic Name' column.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame holding the rows of 'Sheet1' whose 'Topic Name' is not null.\n",
    "    \"\"\"\n",
    "    # Ask for 'Sheet1' directly rather than parsing every sheet of the workbook\n",
    "    # (sheet_name=None) only to throw away all but one.\n",
    "    df = pd.read_excel(path, sheet_name='Sheet1', engine='openpyxl')\n",
    "    return df[df['Topic Name'].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1953084d-3183-482d-b911-9f1e1b26d696",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed\n",
      "  warn(msg)\n",
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Conditional Formatting extension is not supported and will be removed\n",
      "  warn(msg)\n"
     ]
    }
   ],
   "source": [
    "# Pre-TBIP annotation sheets for the floor-speech topics, one per annotator.\n",
    "annotator1_speeches, annotator2_speeches = (\n",
    "    get_dataframe_from_annotated_xlsx_file_path(sheet_path)\n",
    "    for sheet_path in (\n",
    "        'venue_diff_polsci/pre_tbip_annotation_results/annotator_1/topics_for_annotation.xlsx_frazier_speech.xlsx',\n",
    "        'venue_diff_polsci/pre_tbip_annotation_results/annotator_2/topics_for_annotation_Hightower_speech.xlsx',\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2250e007-0016-4cad-a6e2-1cf7c3d659d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map topic index -> annotator 1's coherence score; the number parsed from the\n",
    "# 'Topic' cell is shifted down by one, as for the consensus label dictionaries.\n",
    "speech_ts1 = list(annotator1_speeches['Topic'])\n",
    "speech_coherence1 = list(annotator1_speeches['Coherence'])\n",
    "speeches_topic_to_coherence1 = {\n",
    "    int(topic_cell.split()[1]) - 1: coherence\n",
    "    for topic_cell, coherence in zip(speech_ts1, speech_coherence1)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "10d63e60-5244-4a86-b005-86307bee6b78",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Speech topics to which annotator 1 gave a coherence score of 1.0.\n",
    "discard_based_on_coherence_speeches1 = [\n",
    "    topic_ind\n",
    "    for topic_ind, coherence in speeches_topic_to_coherence1.items()\n",
    "    if coherence == 1.0\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3378123e-3bf9-41c9-97fb-1113934f798b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map topic index -> annotator 2's coherence score; the number parsed from the\n",
    "# 'Topic' cell is shifted down by one, as for the consensus label dictionaries.\n",
    "speech_ts2 = list(annotator2_speeches['Topic'])\n",
    "speech_coherence2 = list(annotator2_speeches['Coherence'])\n",
    "speeches_topic_to_coherence2 = {\n",
    "    int(topic_cell.split()[1]) - 1: coherence\n",
    "    for topic_cell, coherence in zip(speech_ts2, speech_coherence2)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "16ebe496-bdf4-4f0f-bcc8-3de4aa6a1a1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Speech topics to which annotator 2 gave a coherence score of 1.0.\n",
    "discard_based_on_coherence_speeches2 = [\n",
    "    topic_ind\n",
    "    for topic_ind, coherence in speeches_topic_to_coherence2.items()\n",
    "    if coherence == 1.0\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "11331211-16b0-4c33-aa95-20b3b15ce958",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(15, 'Annotator 2 rated coherence as 1.0')\n",
      "International Human Rights\n",
      "(31, 'Annotator 2 rated coherence as 1.0')\n",
      "Support for Legislation\n",
      "(35, 'Annotator 2 rated coherence as 1.0')\n",
      "Elections and Appointments\n",
      "(46, 'Annotator 2 rated coherence as 1.0')\n",
      "Funding\n"
     ]
    }
   ],
   "source": [
    "topics_to_consider_removing_speeches1 = []\n",
    "\n",
    "# Report every label-selected speech topic that an annotator scored 1.0 for coherence,\n",
    "# checking annotator 1 before annotator 2 for each topic.\n",
    "_speech_coherence_flags = (\n",
    "    ('Annotator 1 rated coherence as 1.0', discard_based_on_coherence_speeches1),\n",
    "    ('Annotator 2 rated coherence as 1.0', discard_based_on_coherence_speeches2),\n",
    ")\n",
    "for t in selected_speech_topics_after_discarding_based_on_labels:\n",
    "    for _message, _flagged_topics in _speech_coherence_flags:\n",
    "        if t in _flagged_topics:\n",
    "            print((t, _message))\n",
    "            print(speech_topics_to_labels[t])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "0062ef0d-6d1c-4fe8-b3b0-ff3abc00bd5a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[16]\n"
     ]
    }
   ],
   "source": [
    "# Only speech topics that BOTH annotators scored 1.0 for coherence are discarded.\n",
    "topics_to_dicard_based_on_coherence_score_speeches = list(\n",
    "    set(discard_based_on_coherence_speeches1) & set(discard_based_on_coherence_speeches2)\n",
    ")\n",
    "print(topics_to_dicard_based_on_coherence_score_speeches)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5151617-f1b6-4f75-9538-4ee034424fc4",
   "metadata": {},
   "source": [
    "### Current strategy to form finalized list of topics included for post-tbip annotation\n",
    "1. Discard topics based on their consensus labels: those labelled DISCARD, plus non-substantive-issue ones such as procedural, congratulatory, etc.\n",
    "2. Discard topics that were rated 1 for coherence by BOTH annotators (open question: should we also discard those rated 1 for coherence by only one of the two annotators?)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "79974cac-6957-4968-a697-3a39ce729b00",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "# Drop the coherence-based discards from the label-filtered speech topics, keeping their order.\n",
    "final_selected_speech_topic_inds = [\n",
    "    topic_ind\n",
    "    for topic_ind in selected_speech_topics_after_discarding_based_on_labels\n",
    "    if topic_ind not in topics_to_dicard_based_on_coherence_score_speeches\n",
    "]\n",
    "print(len(final_selected_speech_topic_inds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "3b9d413f-2ad0-4ddc-bc45-cf437863b51f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "# Consensus labels of the finally selected speech topics, in the same order as the indices.\n",
    "final_selected_speech_topic_labels = [\n",
    "    speech_topics_to_labels[topic_ind]\n",
    "    for topic_ind in final_selected_speech_topic_inds\n",
    "]\n",
    "print(len(final_selected_speech_topic_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "97b43849-3470-43bb-8946-bbd60f616d37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the selected speech topic indices as a .npy file.\n",
    "np.save(\n",
    "    file='venue_diff_polsci/floor_speeches/post_tbip_output_labeling/final_selected_speech_topic_inds.npy',\n",
    "    arr=final_selected_speech_topic_inds,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80b60e5c-d45f-4dce-bc4c-815db9a0db42",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "0e33bbca-51f8-4487-a370-1bc95944328e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pre-TBIP annotation sheets for the tweet topics, one per annotator.\n",
    "annotator1_tweets, annotator2_tweets = (\n",
    "    get_dataframe_from_annotated_xlsx_file_path(sheet_path)\n",
    "    for sheet_path in (\n",
    "        'venue_diff_polsci/pre_tbip_annotation_results/annotator_1/topics_for_annotation.xlsx_Frazier_tweet.xlsx',\n",
    "        'venue_diff_polsci/pre_tbip_annotation_results/annotator_2/topics_for_annotation_Hightower_tweets.xlsx',\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "e0550829-b13d-4088-97d6-aac5e13623b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map topic index -> annotator 1's coherence score.\n",
    "# The parsed topic number is shifted down by one so the keys line up with\n",
    "# tweet_topics_to_labels and with the speeches coherence dictionaries, which all use\n",
    "# `int(...) - 1`; without the shift the later membership checks compared zero-based\n",
    "# label indices against one-based coherence keys.\n",
    "# NOTE(review): assumes the annotators' tweet sheets number topics from 1, like the\n",
    "# other sheets - confirm against the spreadsheets.\n",
    "tweet_topic_to_coherence1 = {}\n",
    "tweet_ts1 = list(annotator1_tweets['Topic'])\n",
    "tweet_coherence1 = list(annotator1_tweets['Coherence'])\n",
    "for t, l in zip(tweet_ts1, tweet_coherence1):\n",
    "    tweet_topic_to_coherence1[int(t.split()[1]) - 1] = l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "105c024f-19ed-4ab1-980a-9c8f0841aad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "discard_based_on_coherence_tweets1 = [t for t in tweet_topic_to_coherence1 if tweet_topic_to_coherence1[t] == 1.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "b831ea53-34a0-4894-97c1-c3fc7118c69b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map topic index -> annotator 2's coherence score, using the same one-step shift\n",
    "# as annotator 1's dictionary so the two sets of keys can be intersected.\n",
    "tweet_topic_to_coherence2 = {}\n",
    "tweet_ts2 = list(annotator2_tweets['Topic'])\n",
    "tweet_coherence2 = list(annotator2_tweets['Coherence'])\n",
    "for t, l in zip(tweet_ts2, tweet_coherence2):\n",
    "    tweet_topic_to_coherence2[int(t.split()[1]) - 1] = l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "b17f45d3-6ec7-4a11-a952-f5896faa289b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tweet topics to which annotator 2 gave a coherence score of 1.0.\n",
    "discard_based_on_coherence_tweets2 = [\n",
    "    topic_key\n",
    "    for topic_key, coherence in tweet_topic_to_coherence2.items()\n",
    "    if coherence == 1.0\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "fddeee08-eb28-4f7d-86d9-3b13f6b52b39",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(4, 'Annotator 2 rated coherence as 1.0')\n",
      "Holidays and Events\n",
      "(12, 'Annotator 2 rated coherence as 1.0')\n",
      "Promoting Unity\n",
      "(19, 'Annotator 2 rated coherence as 1.0')\n",
      "Legislation\n",
      "(21, 'Annotator 2 rated coherence as 1.0')\n",
      "Public Health\n",
      "(24, 'Annotator 2 rated coherence as 1.0')\n",
      "Job Market\n",
      "(25, 'Annotator 2 rated coherence as 1.0')\n",
      "Registration\n",
      "(26, 'Annotator 2 rated coherence as 1.0')\n",
      "Civil Rights of Gender and Sexuality\n",
      "(37, 'Annotator 2 rated coherence as 1.0')\n",
      "Job Creation\n",
      "(40, 'Annotator 2 rated coherence as 1.0')\n",
      "Border Wall\n"
     ]
    }
   ],
   "source": [
    "#topics_to_consider_removing_speeches1 = []\n",
    "# Report every label-selected tweet topic that an annotator scored 1.0 for coherence,\n",
    "# checking annotator 1 before annotator 2 for each topic.\n",
    "_tweet_coherence_flags = (\n",
    "    ('Annotator 1 rated coherence as 1.0', discard_based_on_coherence_tweets1),\n",
    "    ('Annotator 2 rated coherence as 1.0', discard_based_on_coherence_tweets2),\n",
    ")\n",
    "for t in selected_tweet_topics_after_discarding_based_on_labels:\n",
    "    for _message, _flagged_topics in _tweet_coherence_flags:\n",
    "        if t in _flagged_topics:\n",
    "            print((t, _message))\n",
    "            print(tweet_topics_to_labels[t])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "4fa9d2d2-bc53-42c0-a0a2-bfd953c94f80",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    }
   ],
   "source": [
    "# Only tweet topics that BOTH annotators scored 1.0 for coherence are discarded.\n",
    "topics_to_dicard_based_on_coherence_score_tweets = list(\n",
    "    set(discard_based_on_coherence_tweets1) & set(discard_based_on_coherence_tweets2)\n",
    ")\n",
    "print(topics_to_dicard_based_on_coherence_score_tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "800508f3-3610-4636-a5d9-6c909abdb470",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    }
   ],
   "source": [
    "# Drop the coherence-based discards from the label-filtered tweet topics, keeping their order.\n",
    "final_selected_tweet_topic_inds = [\n",
    "    topic_ind\n",
    "    for topic_ind in selected_tweet_topics_after_discarding_based_on_labels\n",
    "    if topic_ind not in topics_to_dicard_based_on_coherence_score_tweets\n",
    "]\n",
    "print(len(final_selected_tweet_topic_inds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "6408a991-9ab3-4a0c-b2d5-bc306abd6c62",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    }
   ],
   "source": [
    "# Consensus labels of the finally selected tweet topics, in the same order as the indices.\n",
    "final_selected_tweet_topic_labels = [\n",
    "    tweet_topics_to_labels[topic_ind]\n",
    "    for topic_ind in final_selected_tweet_topic_inds\n",
    "]\n",
    "print(len(final_selected_tweet_topic_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86872ab5-f8a4-4e04-b5d2-0827adea6fc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the selected tweet topic indices as a .npy file.\n",
    "np.save(\n",
    "    file='venue_diff_polsci/twitter/post_tbip_output_labeling/final_selected_tweet_topic_inds.npy',\n",
    "    arr=final_selected_tweet_topic_inds,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "id": "8a963c3c-f72b-4627-bd67-b1f3d2b88d04",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "id": "f44c0828-28ee-4427-984f-e6105bf0de23",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a']"
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Scratch check: draw one element from a two-item list using a Random instance seeded with 42.\n",
    "random.Random(42).sample(['a', 'b'], 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "id": "a34d58ab-149c-4b96-b050-6568a8eccb28",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_topics_file_for_human_labeling_and_rating(topic_word_minus1,\n",
    "                                                     topic_word_plus1,\n",
    "                                                     vocab,\n",
    "                                                     outpath,\n",
    "                                                     selected_topic_inds,\n",
    "                                                     selected_topic_labels,\n",
    "                                                     num_top_words = 20):\n",
    "    workbook = xlsxwriter.Workbook(outpath + 'annotation_file.xlsx')\n",
    "    workbook.formats[0].set_font_size(12)\n",
    "    worksheet = workbook.add_worksheet()\n",
    "    worksheet.freeze_panes(1, 0)\n",
    "    \n",
    "    # Add a format for the header cells.\n",
    "    header_format = workbook.add_format({\n",
    "        'bottom': 5,\n",
    "        'top':5,\n",
    "        'font_size':12,\n",
    "        #'bg_color': '#C6EFCE',\n",
    "        'bold': True,\n",
    "        'text_wrap': True,\n",
    "        #'valign': 'center',\n",
    "        'align': 'center',\n",
    "        #'indent': 1,\n",
    "    })\n",
    "    \n",
    "    n_labels = len(selected_topic_labels)\n",
    "    \n",
    "    worksheet.set_row(0, 40)\n",
    "\n",
    "    # Set up layout of the worksheet.\n",
    "    worksheet.set_column('A:A', 25)\n",
    "    worksheet.set_column('B:B', 10)\n",
    "    worksheet.set_column('C:C', 15)\n",
    "    worksheet.set_column('D:D', 25)\n",
    "    worksheet.set_column('E:E', 10)\n",
    "    worksheet.set_column('F:F', 3)\n",
    "    worksheet.set_column('G:G', 30)\n",
    "    worksheet.set_column('H:H', 30)\n",
    "    worksheet.set_column('I:I', 50)\n",
    "    worksheet.set_column('J:J', 25)\n",
    "    worksheet.set_column('K:K', 25)\n",
    "    worksheet.set_column('L:L', 150)\n",
    "    #max_rows = (num_top_words+3)*n_labels + 10\n",
    "    #print(max_rows)\n",
    "    #worksheet.set_row(0, max_rows)\n",
    "\n",
    "    # Write the header cells and some data that will be used in the examples.\n",
    "    worksheet.write('A1', \n",
    "                    '', \n",
    "                    header_format)\n",
    "    worksheet.write('B1', \n",
    "                    '', \n",
    "                    header_format)\n",
    "    \n",
    "    worksheet.write('C1', \n",
    "                    'Issue', \n",
    "                    header_format)\n",
    "    \n",
    "    worksheet.write('D1', \n",
    "                    '', \n",
    "                    header_format)\n",
    "    worksheet.write('E1', \n",
    "                    '', \n",
    "                    header_format)\n",
    "    worksheet.write('F1', \n",
    "                    '', \n",
    "                    header_format)\n",
    "    \n",
    "    worksheet.write('G1', \n",
    "                    'Label Applicability for a)', \n",
    "                    header_format)\n",
    "    \n",
    "    worksheet.write('H1', \n",
    "                    'Label Applicability for b)', \n",
    "                    header_format)\n",
    "    \n",
    "    worksheet.write('I1', \n",
    "                    'Ideological Polarization', \n",
    "                    header_format)\n",
    "    \n",
    "    worksheet.write('J1', \n",
    "                    'Ideological Position expressed in a)', \n",
    "                    header_format)\n",
    "    \n",
    "    worksheet.write('K1', \n",
    "                    'Ideological Position expressed in b)', \n",
    "                    header_format)\n",
    "    \n",
    "    worksheet.write('L1', \n",
    "                    'Notes/Comments', \n",
    "                    header_format)\n",
    "    \n",
    "    border = workbook.add_format({'top': 2,\n",
    "                                  'bottom': 2})\n",
    "    border_plus_highlighting = workbook.add_format({'top': 2,\n",
    "                                                    'bottom': 2,\n",
    "                                                    'left': 1,\n",
    "                                                    'right': 1,\n",
    "                                                    'bg_color': '#FFFFEC'})\n",
    "    \n",
    "    #issue1_row_ind = 3\n",
    "    random_seeds = [i + 1 for i in range(len(selected_topic_inds))]\n",
    "    num_topics, num_words = topic_word_minus1.shape\n",
    "    topic_ind_to_a_b_info = {}\n",
    "    for k in selected_topic_inds:\n",
    "        topic_ind_to_a_b_info[k] = {}\n",
    "    \n",
    "    on_issue = 0\n",
    "    \n",
    "    while on_issue < n_labels:\n",
    "        #print(on_issue)\n",
    "        k = selected_topic_inds[on_issue]\n",
    "        seed = random_seeds[on_issue]\n",
    "        issue_row = ((num_top_words+3)*on_issue) + 3\n",
    "        for c in ['A', 'B', 'D', 'E', 'F', 'L']:\n",
    "            worksheet.write(c + str(issue_row), '', border)\n",
    "        for c in ['G', 'H', 'I', 'J', 'K']:\n",
    "            worksheet.write(c + str(issue_row), '', border_plus_highlighting)\n",
    "            \n",
    "        worksheet.write('C' + str(issue_row), 'Issue ' + str(on_issue + 1), border)\n",
    "        \n",
    "        worksheet.write('A' + str(issue_row + 1), 'a)')\n",
    "        worksheet.set_row(issue_row, None, None, {'collapsed': True})\n",
    "        worksheet.write('D' + str(issue_row + 1), 'b)')\n",
    "        \n",
    "        a_choice = random.Random(seed).sample([-1, 1], 1)[0]\n",
    "        \n",
    "        if a_choice == -1:\n",
    "            topic_ind_to_a_b_info[k]['a'] = -1\n",
    "            topic_ind_to_a_b_info[k]['b'] = 1\n",
    "            top_word_inds_a = np.argsort(list(topic_word_minus1[k]))[::-1][:num_top_words]\n",
    "            top_words_a = [vocab[i] for i in top_word_inds_a]\n",
    "            top_word_probs_a = [topic_word_minus1[k][i] for i in top_word_inds_a]\n",
    "            top_word_inds_b = np.argsort(list(topic_word_plus1[k]))[::-1][:num_top_words]\n",
    "            top_words_b = [vocab[i] for i in top_word_inds_b]\n",
    "            top_word_probs_b = [topic_word_plus1[k][i] for i in top_word_inds_b]\n",
    "            \n",
    "            for ii in range(2, num_top_words + 2):\n",
    "                worksheet.write('A' + str(issue_row + ii), top_words_a[ii-2])\n",
    "                worksheet.write('B' + str(issue_row + ii), top_word_probs_a[ii-2])\n",
    "                \n",
    "                worksheet.write('D' + str(issue_row + ii), top_words_b[ii-2])\n",
    "                worksheet.write('E' + str(issue_row + ii), top_word_probs_b[ii-2])\n",
    "                \n",
    "                worksheet.set_row(issue_row + ii - 1, None, None, {'level': 1, 'hidden': True})\n",
    "            \n",
    "            worksheet.conditional_format('B' + str(issue_row + 2) + ':B' + str(issue_row + num_top_words + 1),\n",
    "                                         {'type': 'data_bar',\n",
    "                                          'bar_only': True,\n",
    "                                          'bar_solid': True})\n",
    "            worksheet.conditional_format('E' + str(issue_row + 2) + ':B' + str(issue_row + num_top_words + 1),\n",
    "                                         {'type': 'data_bar',\n",
    "                                          'bar_only': True,\n",
    "                                          'bar_solid': True})\n",
    "            \n",
    "            \n",
    "        elif a_choice == 1:\n",
    "            topic_ind_to_a_b_info[k]['a'] = 1\n",
    "            topic_ind_to_a_b_info[k]['b'] = -1\n",
    "            top_word_inds_b = np.argsort(list(topic_word_minus1[k]))[::-1][:num_top_words]\n",
    "            top_words_b = [vocab[i] for i in top_word_inds_b]\n",
    "            top_word_probs_b = [topic_word_minus1[k][i] for i in top_word_inds_b]\n",
    "            top_word_inds_a = np.argsort(list(topic_word_plus1[k]))[::-1][:num_top_words]\n",
    "            top_words_a = [vocab[i] for i in top_word_inds_a]\n",
    "            top_word_probs_a = [topic_word_plus1[k][i] for i in top_word_inds_a]\n",
    "            \n",
    "            for ii in range(2, num_top_words + 2):\n",
    "                worksheet.write('A' + str(issue_row + ii), top_words_a[ii-2])\n",
    "                worksheet.write('B' + str(issue_row + ii), top_word_probs_a[ii-2])\n",
    "                \n",
    "                worksheet.write('D' + str(issue_row + ii), top_words_b[ii-2])\n",
    "                worksheet.write('E' + str(issue_row + ii), top_word_probs_b[ii-2])\n",
    "                \n",
    "                worksheet.set_row(issue_row + ii - 1, None, None, {'level': 1, 'hidden': True})\n",
    "                \n",
    "            worksheet.conditional_format('B' + str(issue_row + 2) + ':B' + str(issue_row + num_top_words + 1),\n",
    "                                         {'type': 'data_bar',\n",
    "                                          'bar_only': True,\n",
    "                                          'bar_solid': True})\n",
    "            worksheet.conditional_format('E' + str(issue_row + 2) + ':B' + str(issue_row + num_top_words + 1),\n",
    "                                         {'type': 'data_bar',\n",
    "                                          'bar_only': True,\n",
    "                                          'bar_solid': True})\n",
    "        \n",
    "        worksheet.data_validation('G' + str(issue_row),\n",
    "                                  {'validate': 'list',\n",
    "                                  'source': ['a): IS about ' + selected_topic_labels[on_issue], \n",
    "                                             'a): MIGHT be about ' + selected_topic_labels[on_issue], \n",
    "                                             'a): IS NOT about ' + selected_topic_labels[on_issue]]})\n",
    "        worksheet.data_validation('H' + str(issue_row),\n",
    "                                  {'validate': 'list',\n",
    "                                  'source': ['b): IS about ' + selected_topic_labels[on_issue], \n",
    "                                             'b): MIGHT be about ' + selected_topic_labels[on_issue], \n",
    "                                             'b): IS NOT about ' + selected_topic_labels[on_issue]]})\n",
    "        \n",
    "        \n",
    "        #need to adjust for below I column because exceeding excel character limits for list validation for certain long topic names\n",
    "        \n",
    "        persp_list = ['a) and b) represent polarized perspectives on the issue: ' + selected_topic_labels[on_issue], \n",
    "                      'a) and b) represent SOMEWHAT polarized perspectives on above issue',# + selected_topic_labels[on_issue], \n",
    "                      'a) and b) DO NOT represent polarized perspectives on above issue',# + selected_topic_labels[on_issue],\n",
    "                      'Unsure']\n",
    "#         pp = 50 + (on_issue*4) + ((num_top_words+3)*n_labels)\n",
    "#         for persp in persp_list:\n",
    "#             worksheet.write(\"M{}\".format(pp), persp)\n",
    "#             pp = pp + 1\n",
    "        worksheet.data_validation('I' + str(issue_row),\n",
    "                                  {'validate': 'list',\n",
    "                                   'source': persp_list})\n",
    "                                  #'source': '=Details!$M$' + str(pp-4) + ':$M$' + str(pp-1)})\n",
    "        \n",
    "        \n",
    "        worksheet.data_validation('J' + str(issue_row),\n",
    "                                  {'validate': 'list',\n",
    "                                  'source': ['Liberal', \n",
    "                                             'Unsure', \n",
    "                                             'Conservative',\n",
    "                                             'N/A']})\n",
    "        worksheet.data_validation('K' + str(issue_row),\n",
    "                                  {'validate': 'list',\n",
    "                                  'source': ['Liberal', \n",
    "                                             'Unsure', \n",
    "                                             'Conservative',\n",
    "                                             'N/A']})\n",
    "        on_issue += 1\n",
    "        \n",
    "    workbook.close()\n",
    "    return topic_ind_to_a_b_info\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "id": "f10d494b-6023-42e1-af39-8356fa18d6e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "topic_ind_to_a_b_info_speech = create_topics_file_for_human_labeling_and_rating(ideological_topic_speeches_minus1,\n",
    "                                                 ideological_topic_speeches_plus1,\n",
    "                                                 vocabulary_speeches,\n",
    "                                                 'venue_diff_polsci/floor_speeches/post_tbip_output_labeling/',\n",
    "                                                 final_selected_speech_topic_inds,\n",
    "                                                 final_selected_speech_topic_labels\n",
    "                                                )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "id": "c98adf8c-8d64-4609-b5f0-a8d0f8ed16ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "pickle.dump(topic_ind_to_a_b_info_speech,\n",
    "            open('venue_diff_polsci/floor_speeches/topic_ind_to_a_b_info_post_tbip.pkl', \n",
    "                 'wb'))\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "id": "b383d6f6-3135-40b9-9af3-c19c7ba4c8b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "topic_ind_to_a_b_info_tweet = create_topics_file_for_human_labeling_and_rating(ideological_topic_tweets_minus1,\n",
    "                                                 ideological_topic_tweets_plus1,\n",
    "                                                 vocabulary_tweets,\n",
    "                                                 'venue_diff_polsci/twitter/post_tbip_output_labeling/',\n",
    "                                                 final_selected_tweet_topic_inds,\n",
    "                                                 final_selected_tweet_topic_labels\n",
    "                                                )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "id": "4ac7abae-50eb-4335-ba69-ad30b9e866e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "pickle.dump(topic_ind_to_a_b_info_tweet,\n",
    "            open('venue_diff_polsci/twitter/topic_ind_to_a_b_info_post_tbip.pkl', \n",
    "                 'wb'))\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "789c457c-84f7-48ea-93c7-6258c7f56f40",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tbip] *",
   "language": "python",
   "name": "conda-env-tbip-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
